#ifndef CUFFTDX_FFT_16_FP64_INV_PTX_HPP
#define CUFFTDX_FFT_16_FP64_INV_PTX_HPP



// Auto-generated cuFFTDx PTX: 16-point double-precision FFT computed entirely
// in registers by a single thread (variant id 600). Reads 16 complex<double>
// values from rmem[0..15] and writes the transformed values back in place.
// `smem` is accepted for interface uniformity with the other variants but is
// not referenced by this PTX body.
//
// The f64 hex constants are the 16-point twiddle factors:
//   0d3FE6A09E667F3BCD =  0.70710678... = sqrt(2)/2  (cos/sin pi/4)
//   0d3FED906BCF328D46 =  0.92387953... = cos(pi/8)
//   0d3FD87DE2A6AEA963 =  0.38268343... = sin(pi/8)
//   0dBF... variants are the negated values.
//
// NOTE(review): several input operands are deliberately bound twice (e.g.
// rmem[1].y appears as both %35 and %36) — this padding matches the
// generator's operand numbering inside the PTX string. Do not deduplicate the
// constraint list or renumber %-operands by hand; regenerate instead.
template<> __forceinline__ __device__ void cufftdx_private_function<600, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){

// Single asm statement: all adds/subs/fmas below implement the butterfly
// network; outputs %0..%31 overwrite rmem[0..15].{x,y}.
asm volatile (R"({
.reg .f64 fd<240>;
.reg .b64 rd<2>;
add.f64 fd65, %32, %53;
add.f64 fd66, %33, %55;
sub.f64 fd67, %32, %53;
sub.f64 fd68, %33, %55;
add.f64 fd69, %42, %64;
add.f64 fd70, %44, %65;
sub.f64 fd71, %42, %64;
sub.f64 fd72, %44, %65;
add.f64 fd73, fd65, fd69;
add.f64 fd74, fd66, fd70;
sub.f64 fd75, fd65, fd69;
sub.f64 fd76, fd66, fd70;
sub.f64 fd77, fd67, fd72;
add.f64 fd78, fd68, fd71;
add.f64 fd79, fd67, fd72;
sub.f64 fd80, fd68, fd71;
add.f64 fd81, %37, %58;
add.f64 fd82, %39, %60;
sub.f64 fd83, %37, %58;
sub.f64 fd84, %39, %60;
add.f64 fd85, %48, %69;
add.f64 fd86, %49, %71;
sub.f64 fd87, %48, %69;
sub.f64 fd88, %49, %71;
add.f64 fd89, fd81, fd85;
add.f64 fd90, fd82, fd86;
sub.f64 fd91, fd81, fd85;
sub.f64 fd92, fd82, fd86;
sub.f64 fd93, fd83, fd88;
add.f64 fd94, fd84, fd87;
add.f64 fd95, fd83, fd88;
sub.f64 fd96, fd84, fd87;
mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD;
mul.f64 fd98, fd94, 0d3FE6A09E667F3BCD;
sub.f64 fd99, fd97, fd98;
add.f64 fd100, fd97, fd98;
mul.f64 fd101, fd95, 0dBFE6A09E667F3BCD;
mul.f64 fd102, fd96, 0d3FE6A09E667F3BCD;
sub.f64 fd103, fd101, fd102;
mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD;
fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104;
add.f64 fd106, fd73, fd89;
add.f64 fd107, fd74, fd90;
sub.f64 fd108, fd73, fd89;
sub.f64 fd109, fd74, fd90;
add.f64 fd110, fd77, fd99;
add.f64 fd111, fd78, fd100;
sub.f64 fd112, fd77, fd99;
sub.f64 fd113, fd78, fd100;
sub.f64 fd114, fd75, fd92;
add.f64 fd115, fd76, fd91;
add.f64 fd116, fd75, fd92;
sub.f64 fd117, fd76, fd91;
add.f64 fd118, fd79, fd103;
add.f64 fd119, fd80, fd105;
sub.f64 fd120, fd79, fd103;
sub.f64 fd121, fd80, fd105;
add.f64 fd122, %34, %56;
add.f64 fd123, %36, %57;
sub.f64 fd124, %34, %56;
sub.f64 fd125, %36, %57;
add.f64 fd126, %45, %66;
add.f64 fd127, %47, %68;
sub.f64 fd128, %45, %66;
sub.f64 fd129, %47, %68;
add.f64 fd130, fd122, fd126;
add.f64 fd131, fd123, fd127;
sub.f64 fd132, fd122, fd126;
sub.f64 fd133, fd123, fd127;
sub.f64 fd134, fd124, fd129;
add.f64 fd135, fd125, fd128;
add.f64 fd136, fd124, fd129;
sub.f64 fd137, fd125, fd128;
add.f64 fd138, %40, %61;
add.f64 fd139, %41, %63;
sub.f64 fd140, %40, %61;
sub.f64 fd141, %41, %63;
add.f64 fd142, %50, %72;
add.f64 fd143, %52, %73;
sub.f64 fd144, %50, %72;
sub.f64 fd145, %52, %73;
add.f64 fd146, fd138, fd142;
add.f64 fd147, fd139, fd143;
sub.f64 fd148, fd138, fd142;
sub.f64 fd149, fd139, fd143;
sub.f64 fd150, fd140, fd145;
add.f64 fd151, fd141, fd144;
add.f64 fd152, fd140, fd145;
sub.f64 fd153, fd141, fd144;
mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD;
mul.f64 fd155, fd151, 0d3FE6A09E667F3BCD;
sub.f64 fd156, fd154, fd155;
add.f64 fd157, fd154, fd155;
mul.f64 fd158, fd152, 0dBFE6A09E667F3BCD;
mul.f64 fd159, fd153, 0d3FE6A09E667F3BCD;
sub.f64 fd160, fd158, fd159;
mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD;
fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161;
add.f64 fd163, fd130, fd146;
add.f64 fd164, fd131, fd147;
sub.f64 fd165, fd130, fd146;
sub.f64 fd166, fd131, fd147;
add.f64 fd167, fd134, fd156;
add.f64 fd168, fd135, fd157;
sub.f64 fd169, fd134, fd156;
sub.f64 fd170, fd135, fd157;
sub.f64 fd171, fd132, fd149;
add.f64 fd172, fd133, fd148;
add.f64 fd173, fd132, fd149;
sub.f64 fd174, fd133, fd148;
add.f64 fd175, fd136, fd160;
add.f64 fd176, fd137, fd162;
sub.f64 fd177, fd136, fd160;
sub.f64 fd178, fd137, fd162;
mul.f64 fd179, fd167, 0d3FED906BCF328D46;
mul.f64 fd180, fd168, 0d3FD87DE2A6AEA963;
sub.f64 fd181, fd179, fd180;
mul.f64 fd182, fd168, 0d3FED906BCF328D46;
fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182;
mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD;
mul.f64 fd185, fd172, 0d3FE6A09E667F3BCD;
sub.f64 fd186, fd184, fd185;
add.f64 fd187, fd184, fd185;
mul.f64 fd188, fd175, 0d3FD87DE2A6AEA963;
mul.f64 fd189, fd176, 0d3FED906BCF328D46;
sub.f64 fd190, fd188, fd189;
mul.f64 fd191, fd176, 0d3FD87DE2A6AEA963;
fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191;
mul.f64 fd193, fd169, 0dBFD87DE2A6AEA963;
mul.f64 fd194, fd170, 0d3FED906BCF328D46;
sub.f64 fd195, fd193, fd194;
mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963;
fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196;
mul.f64 fd198, fd173, 0dBFE6A09E667F3BCD;
mul.f64 fd199, fd174, 0d3FE6A09E667F3BCD;
sub.f64 fd200, fd198, fd199;
mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD;
fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201;
mul.f64 fd203, fd177, 0dBFED906BCF328D46;
mul.f64 fd204, fd178, 0d3FD87DE2A6AEA963;
sub.f64 fd205, fd203, fd204;
mul.f64 fd206, fd178, 0dBFED906BCF328D46;
fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206;
add.f64 %1, fd107, fd164;
add.f64 %0, fd106, fd163;
add.f64 %3, fd111, fd183;
add.f64 %2, fd110, fd181;
add.f64 %5, fd115, fd187;
add.f64 %4, fd114, fd186;
add.f64 %7, fd119, fd192;
add.f64 %6, fd118, fd190;
add.f64 %9, fd109, fd165;
sub.f64 %8, fd108, fd166;
add.f64 %11, fd113, fd197;
add.f64 %10, fd112, fd195;
add.f64 %13, fd117, fd202;
add.f64 %12, fd116, fd200;
add.f64 %15, fd121, fd207;
add.f64 %14, fd120, fd205;
sub.f64 %17, fd107, fd164;
sub.f64 %16, fd106, fd163;
sub.f64 %19, fd111, fd183;
sub.f64 %18, fd110, fd181;
sub.f64 %21, fd115, fd187;
sub.f64 %20, fd114, fd186;
sub.f64 %23, fd119, fd192;
sub.f64 %22, fd118, fd190;
sub.f64 %25, fd109, fd165;
add.f64 %24, fd108, fd166;
sub.f64 %27, fd113, fd197;
sub.f64 %26, fd112, fd195;
sub.f64 %29, fd117, fd202;
sub.f64 %28, fd116, fd200;
sub.f64 %31, fd121, fd207;
sub.f64 %30, fd120, fd205;
})"
     : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y));
};




// Auto-generated cuFFTDx PTX (variant id 601): one stage of a cooperative
// 16-point double-precision FFT where each thread holds 4 complex<double>
// values (rmem[0..3]). The PTX:
//   - computes a per-thread 4-point butterfly on the inputs,
//   - applies twiddles loaded from the global LUT `lut_dp_4_16` (external
//     symbol, declared elsewhere in this project; ld.global.v2.f64),
//   - exchanges data with sibling threads through shared memory (`smem` is a
//     byte offset; %tid.y selects a 256-byte row, %tid.x the slot),
//   - reads back the transposed values and finishes with a 2x2 butterfly.
//
// Contains two `barrier.sync 0` (block-wide barriers): ALL threads of the
// block must call this function, and uniformly — do not call it from
// divergent control flow.
//
// NOTE(review): as in the other variants, some rmem[i].y input operands are
// intentionally bound twice; keep the constraint list exactly as generated.
template<> __forceinline__ __device__ void cufftdx_private_function<601, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){

asm volatile (R"({
.reg .b32 r<14>;
.reg .f64 fd<93>;
.reg .b64 rd<6>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 8;
mov.u32 r3, %8;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f64 fd17, %10, %15;
add.f64 fd18, %11, %17;
sub.f64 fd19, %10, %15;
sub.f64 fd20, %11, %17;
add.f64 fd21, %12, %18;
add.f64 fd22, %14, %19;
sub.f64 fd23, %12, %18;
sub.f64 fd24, %14, %19;
sub.f64 fd25, fd17, fd21;
sub.f64 fd26, fd18, fd22;
sub.f64 fd27, fd19, fd24;
add.f64 fd28, fd20, fd23;
add.f64 fd29, fd19, fd24;
sub.f64 fd30, fd20, fd23;
and.b32 r6, r5, 3;
shl.b32 r7, r5, 6;
and.b32 r8, r7, -256;
add.s32 r9, r4, r8;
shl.b32 r10, r5, 4;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 48;
mov.u64 rd4, %9;
add.s64 rd5, rd4, rd3;
ld.global.v2.f64 {fd31, fd32}, [rd5];
mul.f64 fd35, fd28, fd32;
mul.f64 fd36, fd27, fd32;
mul.f64 fd37, fd31, fd28;
mul.f64 fd38, fd31, fd31;
mul.f64 fd39, fd32, fd32;
sub.f64 fd40, fd38, fd39;
mul.f64 fd41, fd32, fd31;
fma.rn.f64 fd42, fd32, fd31, fd41;
mul.f64 fd43, fd26, fd42;
mul.f64 fd44, fd25, fd42;
mul.f64 fd45, fd40, fd26;
ld.global.v2.f64 {fd46, fd47}, [rd5+64];
mul.f64 fd50, fd30, fd47;
mul.f64 fd51, fd29, fd47;
mul.f64 fd52, fd46, fd30;
barrier.sync 0;
and.b32 r11, r7, 192;
add.s32 r12, r9, r11;
add.f64 fd53, fd18, fd22;
add.f64 fd54, fd17, fd21;
st.shared.v2.f64 [r12], {fd54, fd53};
fma.rn.f64 fd55, fd31, fd27, fd35;
sub.f64 fd56, fd37, fd36;
st.shared.v2.f64 [r12+16], {fd55, fd56};
sub.f64 fd57, fd45, fd44;
fma.rn.f64 fd58, fd40, fd25, fd43;
st.shared.v2.f64 [r12+32], {fd58, fd57};
fma.rn.f64 fd59, fd46, fd29, fd50;
sub.f64 fd60, fd52, fd51;
st.shared.v2.f64 [r12+48], {fd59, fd60};
barrier.sync 0;
mad.lo.s32 r13, r6, -48, r12;
ld.shared.v2.f64 {fd61, fd62}, [r13];
ld.shared.v2.f64 {fd65, fd66}, [r13+64];
ld.shared.v2.f64 {fd69, fd70}, [r13+128];
ld.shared.v2.f64 {fd73, fd74}, [r13+192];
add.f64 fd77, fd61, fd69;
add.f64 fd78, fd62, fd70;
sub.f64 fd79, fd61, fd69;
sub.f64 fd80, fd62, fd70;
add.f64 fd81, fd65, fd73;
add.f64 fd82, fd66, fd74;
sub.f64 fd83, fd65, fd73;
sub.f64 fd84, fd66, fd74;
add.f64 %1, fd78, fd82;
add.f64 %0, fd77, fd81;
add.f64 %3, fd80, fd83;
sub.f64 %2, fd79, fd84;
sub.f64 %5, fd78, fd82;
sub.f64 %4, fd77, fd81;
sub.f64 %7, fd80, fd83;
add.f64 %6, fd79, fd84;
})"
     : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y));
};




// Auto-generated cuFFTDx PTX (variant id 602): same cooperative 4-points-per-
// thread FFT stage as variant 601 (twiddles from global LUT `lut_dp_4_16`),
// but with a half-size shared-memory footprint: %tid.y selects a 128-byte row
// (shl.b32 r2, r1, 7 vs. 8 in 601), and the thread exchange is done in TWO
// shared-memory passes — first the real parts are stored/loaded, then the
// imaginary parts — with barriers between each pass.
//
// Contains four `barrier.sync 0` (block-wide barriers): ALL threads of the
// block must call this function uniformly; never call from divergent flow.
//
// NOTE(review): duplicated rmem[i].y input operands in the constraint list
// are generator padding — keep as-is.
template<> __forceinline__ __device__ void cufftdx_private_function<602, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){

asm volatile (R"({
.reg .b32 r<14>;
.reg .f64 fd<85>;
.reg .b64 rd<6>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 7;
mov.u32 r3, %8;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f64 fd17, %10, %15;
add.f64 fd18, %11, %17;
sub.f64 fd19, %10, %15;
sub.f64 fd20, %11, %17;
add.f64 fd21, %12, %18;
add.f64 fd22, %14, %19;
sub.f64 fd23, %12, %18;
sub.f64 fd24, %14, %19;
add.f64 fd25, fd17, fd21;
add.f64 fd26, fd18, fd22;
sub.f64 fd27, fd17, fd21;
sub.f64 fd28, fd18, fd22;
sub.f64 fd29, fd19, fd24;
add.f64 fd30, fd20, fd23;
add.f64 fd31, fd19, fd24;
sub.f64 fd32, fd20, fd23;
and.b32 r6, r5, 3;
shl.b32 r7, r5, 4;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 48;
mov.u64 rd4, %9;
add.s64 rd5, rd4, rd3;
ld.global.v2.f64 {fd33, fd34}, [rd5];
mul.f64 fd37, fd30, fd34;
fma.rn.f64 fd38, fd33, fd29, fd37;
mul.f64 fd39, fd29, fd34;
mul.f64 fd40, fd33, fd30;
sub.f64 fd41, fd40, fd39;
mul.f64 fd42, fd33, fd33;
mul.f64 fd43, fd34, fd34;
sub.f64 fd44, fd42, fd43;
mul.f64 fd45, fd34, fd33;
fma.rn.f64 fd46, fd34, fd33, fd45;
mul.f64 fd47, fd28, fd46;
fma.rn.f64 fd48, fd44, fd27, fd47;
mul.f64 fd49, fd27, fd46;
mul.f64 fd50, fd44, fd28;
sub.f64 fd51, fd50, fd49;
ld.global.v2.f64 {fd52, fd53}, [rd5+64];
mul.f64 fd56, fd32, fd53;
fma.rn.f64 fd57, fd52, fd31, fd56;
mul.f64 fd58, fd31, fd53;
mul.f64 fd59, fd52, fd32;
sub.f64 fd60, fd59, fd58;
shl.b32 r8, r5, 5;
and.b32 r9, r8, -128;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 96;
add.s32 r12, r10, r11;
st.shared.v2.f64 [r12], {fd25, fd38};
st.shared.v2.f64 [r12+16], {fd48, fd57};
barrier.sync 0;
mad.lo.s32 r13, r6, -24, r12;
ld.shared.f64 fd61, [r13];
ld.shared.f64 fd62, [r13+32];
ld.shared.f64 fd63, [r13+64];
ld.shared.f64 fd64, [r13+96];
barrier.sync 0;
st.shared.v2.f64 [r12], {fd26, fd41};
st.shared.v2.f64 [r12+16], {fd51, fd60};
barrier.sync 0;
ld.shared.f64 fd65, [r13];
ld.shared.f64 fd66, [r13+32];
ld.shared.f64 fd67, [r13+64];
ld.shared.f64 fd68, [r13+96];
add.f64 fd69, fd61, fd63;
add.f64 fd70, fd65, fd67;
sub.f64 fd71, fd61, fd63;
sub.f64 fd72, fd65, fd67;
add.f64 fd73, fd62, fd64;
add.f64 fd74, fd66, fd68;
sub.f64 fd75, fd62, fd64;
sub.f64 fd76, fd66, fd68;
add.f64 %0, fd69, fd73;
add.f64 %1, fd70, fd74;
add.f64 %3, fd72, fd75;
sub.f64 %2, fd71, fd76;
sub.f64 %4, fd69, fd73;
sub.f64 %5, fd70, fd74;
sub.f64 %7, fd72, fd75;
add.f64 %6, fd71, fd76;
})"
     : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y));
};




// Auto-generated cuFFTDx PTX (variant id 603): cooperative 16-point FFT where
// each thread holds only 2 complex<double> values (rmem[0..1]) and the
// transform is completed in THREE radix-2 stages. Each stage applies a
// twiddle from one of three external global LUTs — `lut_dp_2_16`,
// `lut_dp_2_8`, `lut_dp_2_4` (ld.global.v2.f64) — and exchanges data between
// threads through shared memory (`smem` byte offset; %tid.y selects a
// 256-byte row, shrinking masks 112/96/64 pick the partner slot per stage).
//
// Contains six `barrier.sync 0` (block-wide barriers): ALL threads of the
// block must call this function uniformly; never call from divergent flow.
template<> __forceinline__ __device__ void cufftdx_private_function<603, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){

asm volatile (R"({
.reg .b32 r<28>;
.reg .f64 fd<76>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 8;
mov.u32 r3, %4;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
sub.f64 fd9, %8, %10;
sub.f64 fd10, %9, %11;
shl.b32 r6, r5, 5;
and.b32 r7, r6, -256;
add.s32 r8, r4, r7;
shl.b32 r9, r5, 4;
cvt.u64.u32 rd2, r9;
and.b64 rd3, rd2, 112;
mov.u64 rd4, %5;
add.s64 rd5, rd4, rd3;
ld.global.v2.f64 {fd11, fd12}, [rd5];
mul.f64 fd15, fd10, fd12;
mul.f64 fd16, fd9, fd12;
mul.f64 fd17, fd11, fd10;
barrier.sync 0;
and.b32 r10, r6, 224;
add.s32 r11, r8, r10;
add.f64 fd18, %9, %11;
add.f64 fd19, %8, %10;
st.shared.v2.f64 [r11], {fd19, fd18};
sub.f64 fd20, fd17, fd16;
fma.rn.f64 fd21, fd11, fd9, fd15;
st.shared.v2.f64 [r11+16], {fd21, fd20};
barrier.sync 0;
and.b32 r12, r9, 112;
sub.s32 r13, r11, r12;
ld.shared.v2.f64 {fd22, fd23}, [r13];
ld.shared.v2.f64 {fd26, fd27}, [r13+128];
sub.f64 fd30, fd22, fd26;
sub.f64 fd31, fd23, fd27;
bfe.u32 r14, r5, 1, 2;
mul.wide.u32 rd6, r14, 16;
mov.u64 rd7, %6;
add.s64 rd8, rd7, rd6;
ld.global.v2.f64 {fd32, fd33}, [rd8];
mul.f64 fd36, fd31, fd33;
mul.f64 fd37, fd30, fd33;
mul.f64 fd38, fd32, fd31;
and.b32 r15, r9, 16;
add.s32 r16, r8, r15;
barrier.sync 0;
and.b32 r17, r6, 192;
add.s32 r18, r16, r17;
add.f64 fd39, fd23, fd27;
add.f64 fd40, fd22, fd26;
st.shared.v2.f64 [r18], {fd40, fd39};
fma.rn.f64 fd41, fd32, fd30, fd36;
sub.f64 fd42, fd38, fd37;
st.shared.v2.f64 [r18+32], {fd41, fd42};
barrier.sync 0;
and.b32 r19, r9, 96;
sub.s32 r20, r18, r19;
ld.shared.v2.f64 {fd43, fd44}, [r20];
ld.shared.v2.f64 {fd47, fd48}, [r20+128];
sub.f64 fd51, fd43, fd47;
sub.f64 fd52, fd44, fd48;
bfe.u32 r21, r5, 2, 1;
mul.wide.u32 rd9, r21, 16;
mov.u64 rd10, %7;
add.s64 rd11, rd10, rd9;
ld.global.v2.f64 {fd53, fd54}, [rd11];
mul.f64 fd57, fd52, fd54;
mul.f64 fd58, fd51, fd54;
mul.f64 fd59, fd53, fd52;
and.b32 r22, r9, 48;
add.s32 r23, r8, r22;
barrier.sync 0;
and.b32 r24, r6, 128;
add.s32 r25, r23, r24;
add.f64 fd60, fd44, fd48;
add.f64 fd61, fd43, fd47;
st.shared.v2.f64 [r25], {fd61, fd60};
fma.rn.f64 fd62, fd53, fd51, fd57;
sub.f64 fd63, fd59, fd58;
st.shared.v2.f64 [r25+64], {fd62, fd63};
barrier.sync 0;
and.b32 r26, r9, 64;
sub.s32 r27, r25, r26;
ld.shared.v2.f64 {fd64, fd65}, [r27];
ld.shared.v2.f64 {fd68, fd69}, [r27+128];
add.f64 %1, fd65, fd69;
add.f64 %0, fd64, fd68;
sub.f64 %3, fd65, fd69;
sub.f64 %2, fd64, fd68;
})"
     : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y));
};




// Auto-generated cuFFTDx PTX (variant id 605): cooperative 16-point FFT where
// each thread holds 8 complex<double> values (rmem[0..7]). The thread first
// computes an 8-point butterfly in registers (twiddle constants
// 0d3FE6A09E667F3BCD = sqrt(2)/2 and its negation), then multiplies by
// twiddles from the external global LUT `lut_dp_8_16` (ld.global.v2.f64,
// powers of the base twiddle built via complex squaring/multiplication),
// exchanges halves with its partner thread through shared memory (`smem`
// byte offset; %tid.y selects a 256-byte row), and finishes with a final
// radix-2 combine.
//
// Contains two `barrier.sync 0` (block-wide barriers): ALL threads of the
// block must call this function uniformly; never call from divergent flow.
//
// NOTE(review): duplicated rmem[i].y input operands in the constraint list
// are generator padding — keep as-is.
template<> __forceinline__ __device__ void cufftdx_private_function<605, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){

asm volatile (R"({
.reg .b32 r<14>;
.reg .f64 fd<206>;
.reg .b64 rd<6>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 8;
mov.u32 r3, %16;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f64 fd33, %18, %28;
add.f64 fd34, %19, %30;
sub.f64 fd35, %18, %28;
sub.f64 fd36, %19, %30;
add.f64 fd37, %23, %34;
add.f64 fd38, %25, %35;
sub.f64 fd39, %23, %34;
sub.f64 fd40, %25, %35;
add.f64 fd41, fd33, fd37;
add.f64 fd42, fd34, fd38;
sub.f64 fd43, fd33, fd37;
sub.f64 fd44, fd34, fd38;
sub.f64 fd45, fd35, fd40;
add.f64 fd46, fd36, fd39;
add.f64 fd47, fd35, fd40;
sub.f64 fd48, fd36, fd39;
add.f64 fd49, %20, %31;
add.f64 fd50, %22, %33;
sub.f64 fd51, %20, %31;
sub.f64 fd52, %22, %33;
add.f64 fd53, %26, %36;
add.f64 fd54, %27, %37;
sub.f64 fd55, %26, %36;
sub.f64 fd56, %27, %37;
add.f64 fd57, fd49, fd53;
add.f64 fd58, fd50, fd54;
sub.f64 fd59, fd49, fd53;
sub.f64 fd60, fd50, fd54;
sub.f64 fd61, fd51, fd56;
add.f64 fd62, fd52, fd55;
add.f64 fd63, fd51, fd56;
sub.f64 fd64, fd52, fd55;
mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD;
mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD;
sub.f64 fd67, fd65, fd66;
add.f64 fd68, fd65, fd66;
mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD;
mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD;
sub.f64 fd71, fd69, fd70;
mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD;
fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72;
sub.f64 fd74, fd41, fd57;
sub.f64 fd75, fd42, fd58;
add.f64 fd76, fd45, fd67;
add.f64 fd77, fd46, fd68;
sub.f64 fd78, fd45, fd67;
sub.f64 fd79, fd46, fd68;
sub.f64 fd80, fd43, fd60;
add.f64 fd81, fd44, fd59;
add.f64 fd82, fd43, fd60;
sub.f64 fd83, fd44, fd59;
add.f64 fd84, fd47, fd71;
add.f64 fd85, fd48, fd73;
sub.f64 fd86, fd47, fd71;
sub.f64 fd87, fd48, fd73;
and.b32 r6, r5, 1;
shl.b32 r7, r5, 7;
and.b32 r8, r7, -256;
add.s32 r9, r4, r8;
shl.b32 r10, r5, 4;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 16;
mov.u64 rd4, %17;
add.s64 rd5, rd4, rd3;
ld.global.v2.f64 {fd88, fd89}, [rd5];
mul.f64 fd92, fd77, fd89;
mul.f64 fd93, fd76, fd89;
mul.f64 fd94, fd88, fd77;
mul.f64 fd95, fd88, fd88;
mul.f64 fd96, fd89, fd89;
sub.f64 fd97, fd95, fd96;
mul.f64 fd98, fd89, fd88;
fma.rn.f64 fd99, fd89, fd88, fd98;
mul.f64 fd100, fd81, fd99;
mul.f64 fd101, fd80, fd99;
mul.f64 fd102, fd97, fd81;
mul.f64 fd103, fd88, fd97;
mul.f64 fd104, fd89, fd99;
sub.f64 fd105, fd103, fd104;
mul.f64 fd106, fd88, fd99;
fma.rn.f64 fd107, fd89, fd97, fd106;
mul.f64 fd108, fd85, fd107;
mul.f64 fd109, fd84, fd107;
mul.f64 fd110, fd105, fd85;
mul.f64 fd111, fd88, fd105;
mul.f64 fd112, fd89, fd107;
sub.f64 fd113, fd111, fd112;
mul.f64 fd114, fd88, fd107;
fma.rn.f64 fd115, fd89, fd105, fd114;
mul.f64 fd116, fd75, fd115;
mul.f64 fd117, fd74, fd115;
mul.f64 fd118, fd113, fd75;
ld.global.v2.f64 {fd119, fd120}, [rd5+32];
mul.f64 fd123, fd79, fd120;
mul.f64 fd124, fd78, fd120;
mul.f64 fd125, fd119, fd79;
mul.f64 fd126, fd88, fd119;
mul.f64 fd127, fd89, fd120;
sub.f64 fd128, fd126, fd127;
mul.f64 fd129, fd88, fd120;
fma.rn.f64 fd130, fd89, fd119, fd129;
mul.f64 fd131, fd83, fd130;
mul.f64 fd132, fd82, fd130;
mul.f64 fd133, fd128, fd83;
mul.f64 fd134, fd88, fd128;
mul.f64 fd135, fd89, fd130;
sub.f64 fd136, fd134, fd135;
mul.f64 fd137, fd88, fd130;
fma.rn.f64 fd138, fd89, fd128, fd137;
mul.f64 fd139, fd87, fd138;
mul.f64 fd140, fd86, fd138;
mul.f64 fd141, fd136, fd87;
barrier.sync 0;
and.b32 r11, r7, 128;
add.s32 r12, r9, r11;
add.f64 fd142, fd42, fd58;
add.f64 fd143, fd41, fd57;
st.shared.v2.f64 [r12], {fd143, fd142};
fma.rn.f64 fd144, fd88, fd76, fd92;
sub.f64 fd145, fd94, fd93;
st.shared.v2.f64 [r12+16], {fd144, fd145};
fma.rn.f64 fd146, fd97, fd80, fd100;
sub.f64 fd147, fd102, fd101;
st.shared.v2.f64 [r12+32], {fd146, fd147};
sub.f64 fd148, fd110, fd109;
fma.rn.f64 fd149, fd105, fd84, fd108;
st.shared.v2.f64 [r12+48], {fd149, fd148};
fma.rn.f64 fd150, fd113, fd74, fd116;
sub.f64 fd151, fd118, fd117;
st.shared.v2.f64 [r12+64], {fd150, fd151};
fma.rn.f64 fd152, fd119, fd78, fd123;
sub.f64 fd153, fd125, fd124;
st.shared.v2.f64 [r12+80], {fd152, fd153};
fma.rn.f64 fd154, fd128, fd82, fd131;
sub.f64 fd155, fd133, fd132;
st.shared.v2.f64 [r12+96], {fd154, fd155};
sub.f64 fd156, fd141, fd140;
fma.rn.f64 fd157, fd136, fd86, fd139;
st.shared.v2.f64 [r12+112], {fd157, fd156};
barrier.sync 0;
mad.lo.s32 r13, r6, -112, r12;
ld.shared.v2.f64 {fd158, fd159}, [r13];
ld.shared.v2.f64 {fd162, fd163}, [r13+32];
ld.shared.v2.f64 {fd166, fd167}, [r13+64];
ld.shared.v2.f64 {fd170, fd171}, [r13+96];
ld.shared.v2.f64 {fd174, fd175}, [r13+128];
ld.shared.v2.f64 {fd178, fd179}, [r13+160];
ld.shared.v2.f64 {fd182, fd183}, [r13+192];
ld.shared.v2.f64 {fd186, fd187}, [r13+224];
add.f64 %1, fd159, fd175;
add.f64 %0, fd158, fd174;
add.f64 %3, fd163, fd179;
add.f64 %2, fd162, fd178;
add.f64 %5, fd167, fd183;
add.f64 %4, fd166, fd182;
add.f64 %7, fd171, fd187;
add.f64 %6, fd170, fd186;
sub.f64 %9, fd159, fd175;
sub.f64 %8, fd158, fd174;
sub.f64 %11, fd163, fd179;
sub.f64 %10, fd162, fd178;
sub.f64 %13, fd167, fd183;
sub.f64 %12, fd166, fd182;
sub.f64 %15, fd171, fd187;
sub.f64 %14, fd170, fd186;
})"
     : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y));
};




// Auto-generated cuFFTDx PTX (variant id 604): same cooperative
// 8-points-per-thread 16-point FFT stage as variant 605 (register 8-point
// butterfly + twiddles from external global LUT `lut_dp_8_16`), but with a
// half-size shared-memory footprint: %tid.y selects a 128-byte row
// (shl.b32 r2, r1, 7 vs. 8 in 605) and the thread exchange is done in TWO
// shared-memory passes — real parts first, then imaginary parts — with
// barriers between each pass.
// Twiddle constants: 0d3FE6A09E667F3BCD = sqrt(2)/2, 0dBFE6A09E667F3BCD its
// negation; higher LUT powers are built by complex squaring/multiplication.
//
// Contains four `barrier.sync 0` (block-wide barriers): ALL threads of the
// block must call this function uniformly; never call from divergent flow.
//
// NOTE(review): duplicated rmem[i].y input operands in the constraint list
// are generator padding — keep as-is. (This specialization <604> appears
// after <605> in the file; order is harmless for explicit specializations.)
template<> __forceinline__ __device__ void cufftdx_private_function<604, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){

asm volatile (R"({
.reg .b32 r<14>;
.reg .f64 fd<190>;
.reg .b64 rd<6>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 7;
mov.u32 r3, %16;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f64 fd33, %18, %28;
add.f64 fd34, %19, %30;
sub.f64 fd35, %18, %28;
sub.f64 fd36, %19, %30;
add.f64 fd37, %23, %34;
add.f64 fd38, %25, %35;
sub.f64 fd39, %23, %34;
sub.f64 fd40, %25, %35;
add.f64 fd41, fd33, fd37;
add.f64 fd42, fd34, fd38;
sub.f64 fd43, fd33, fd37;
sub.f64 fd44, fd34, fd38;
sub.f64 fd45, fd35, fd40;
add.f64 fd46, fd36, fd39;
add.f64 fd47, fd35, fd40;
sub.f64 fd48, fd36, fd39;
add.f64 fd49, %20, %31;
add.f64 fd50, %22, %33;
sub.f64 fd51, %20, %31;
sub.f64 fd52, %22, %33;
add.f64 fd53, %26, %36;
add.f64 fd54, %27, %37;
sub.f64 fd55, %26, %36;
sub.f64 fd56, %27, %37;
add.f64 fd57, fd49, fd53;
add.f64 fd58, fd50, fd54;
sub.f64 fd59, fd49, fd53;
sub.f64 fd60, fd50, fd54;
sub.f64 fd61, fd51, fd56;
add.f64 fd62, fd52, fd55;
add.f64 fd63, fd51, fd56;
sub.f64 fd64, fd52, fd55;
mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD;
mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD;
sub.f64 fd67, fd65, fd66;
add.f64 fd68, fd65, fd66;
mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD;
mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD;
sub.f64 fd71, fd69, fd70;
mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD;
fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72;
add.f64 fd74, fd41, fd57;
add.f64 fd75, fd42, fd58;
sub.f64 fd76, fd41, fd57;
sub.f64 fd77, fd42, fd58;
add.f64 fd78, fd45, fd67;
add.f64 fd79, fd46, fd68;
sub.f64 fd80, fd45, fd67;
sub.f64 fd81, fd46, fd68;
sub.f64 fd82, fd43, fd60;
add.f64 fd83, fd44, fd59;
add.f64 fd84, fd43, fd60;
sub.f64 fd85, fd44, fd59;
add.f64 fd86, fd47, fd71;
add.f64 fd87, fd48, fd73;
sub.f64 fd88, fd47, fd71;
sub.f64 fd89, fd48, fd73;
and.b32 r6, r5, 1;
shl.b32 r7, r5, 4;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 16;
mov.u64 rd4, %17;
add.s64 rd5, rd4, rd3;
ld.global.v2.f64 {fd90, fd91}, [rd5];
mul.f64 fd94, fd79, fd91;
fma.rn.f64 fd95, fd90, fd78, fd94;
mul.f64 fd96, fd78, fd91;
mul.f64 fd97, fd90, fd79;
sub.f64 fd98, fd97, fd96;
mul.f64 fd99, fd90, fd90;
mul.f64 fd100, fd91, fd91;
sub.f64 fd101, fd99, fd100;
mul.f64 fd102, fd91, fd90;
fma.rn.f64 fd103, fd91, fd90, fd102;
mul.f64 fd104, fd83, fd103;
fma.rn.f64 fd105, fd101, fd82, fd104;
mul.f64 fd106, fd82, fd103;
mul.f64 fd107, fd101, fd83;
sub.f64 fd108, fd107, fd106;
mul.f64 fd109, fd90, fd101;
mul.f64 fd110, fd91, fd103;
sub.f64 fd111, fd109, fd110;
mul.f64 fd112, fd90, fd103;
fma.rn.f64 fd113, fd91, fd101, fd112;
mul.f64 fd114, fd87, fd113;
fma.rn.f64 fd115, fd111, fd86, fd114;
mul.f64 fd116, fd86, fd113;
mul.f64 fd117, fd111, fd87;
sub.f64 fd118, fd117, fd116;
mul.f64 fd119, fd90, fd111;
mul.f64 fd120, fd91, fd113;
sub.f64 fd121, fd119, fd120;
mul.f64 fd122, fd90, fd113;
fma.rn.f64 fd123, fd91, fd111, fd122;
mul.f64 fd124, fd77, fd123;
fma.rn.f64 fd125, fd121, fd76, fd124;
mul.f64 fd126, fd76, fd123;
mul.f64 fd127, fd121, fd77;
sub.f64 fd128, fd127, fd126;
ld.global.v2.f64 {fd129, fd130}, [rd5+32];
mul.f64 fd133, fd81, fd130;
fma.rn.f64 fd134, fd129, fd80, fd133;
mul.f64 fd135, fd80, fd130;
mul.f64 fd136, fd129, fd81;
sub.f64 fd137, fd136, fd135;
mul.f64 fd138, fd90, fd129;
mul.f64 fd139, fd91, fd130;
sub.f64 fd140, fd138, fd139;
mul.f64 fd141, fd90, fd130;
fma.rn.f64 fd142, fd91, fd129, fd141;
mul.f64 fd143, fd85, fd142;
fma.rn.f64 fd144, fd140, fd84, fd143;
mul.f64 fd145, fd84, fd142;
mul.f64 fd146, fd140, fd85;
sub.f64 fd147, fd146, fd145;
mul.f64 fd148, fd90, fd140;
mul.f64 fd149, fd91, fd142;
sub.f64 fd150, fd148, fd149;
mul.f64 fd151, fd90, fd142;
fma.rn.f64 fd152, fd91, fd140, fd151;
mul.f64 fd153, fd89, fd152;
fma.rn.f64 fd154, fd150, fd88, fd153;
mul.f64 fd155, fd88, fd152;
mul.f64 fd156, fd150, fd89;
sub.f64 fd157, fd156, fd155;
shl.b32 r8, r5, 6;
and.b32 r9, r8, -128;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 64;
add.s32 r12, r10, r11;
st.shared.v2.f64 [r12], {fd74, fd95};
st.shared.v2.f64 [r12+16], {fd105, fd115};
st.shared.v2.f64 [r12+32], {fd125, fd134};
st.shared.v2.f64 [r12+48], {fd144, fd154};
barrier.sync 0;
mad.lo.s32 r13, r6, -56, r12;
ld.shared.f64 fd158, [r13];
ld.shared.f64 fd159, [r13+16];
ld.shared.f64 fd160, [r13+32];
ld.shared.f64 fd161, [r13+48];
ld.shared.f64 fd162, [r13+64];
ld.shared.f64 fd163, [r13+80];
ld.shared.f64 fd164, [r13+96];
ld.shared.f64 fd165, [r13+112];
barrier.sync 0;
st.shared.v2.f64 [r12], {fd75, fd98};
st.shared.v2.f64 [r12+16], {fd108, fd118};
st.shared.v2.f64 [r12+32], {fd128, fd137};
st.shared.v2.f64 [r12+48], {fd147, fd157};
barrier.sync 0;
ld.shared.f64 fd166, [r13];
ld.shared.f64 fd167, [r13+16];
ld.shared.f64 fd168, [r13+32];
ld.shared.f64 fd169, [r13+48];
ld.shared.f64 fd170, [r13+64];
ld.shared.f64 fd171, [r13+80];
ld.shared.f64 fd172, [r13+96];
ld.shared.f64 fd173, [r13+112];
add.f64 %0, fd158, fd162;
add.f64 %1, fd166, fd170;
add.f64 %2, fd159, fd163;
add.f64 %3, fd167, fd171;
add.f64 %4, fd160, fd164;
add.f64 %5, fd168, fd172;
add.f64 %6, fd161, fd165;
add.f64 %7, fd169, fd173;
sub.f64 %8, fd158, fd162;
sub.f64 %9, fd166, fd170;
sub.f64 %10, fd159, fd163;
sub.f64 %11, fd167, fd171;
sub.f64 %12, fd160, fd164;
sub.f64 %13, fd168, fd172;
sub.f64 %14, fd161, fd165;
sub.f64 %15, fd169, fd173;
})"
     : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y));
};




// Auto-generated cuFFTDx PTX (variant id 606): same cooperative
// 2-points-per-thread 16-point FFT as variant 603 (three radix-2 stages with
// twiddles from the external global LUTs `lut_dp_2_16`, `lut_dp_2_8`,
// `lut_dp_2_4`), but with a half-size shared-memory footprint: %tid.y selects
// a 128-byte row (shl.b32 r2, r1, 7 vs. 8 in 603) and each exchange is split
// into TWO shared-memory passes — real parts stored/loaded first, then
// imaginary parts — with barriers between each pass (twelve `barrier.sync 0`
// total).
//
// Block-wide barriers inside: ALL threads of the block must call this
// function uniformly; never call from divergent control flow.
template<> __forceinline__ __device__ void cufftdx_private_function<606, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){

asm volatile (R"({
.reg .b32 r<28>;
.reg .f64 fd<64>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 7;
mov.u32 r3, %4;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f64 fd9, %8, %10;
add.f64 fd10, %9, %11;
sub.f64 fd11, %8, %10;
sub.f64 fd12, %9, %11;
shl.b32 r6, r5, 4;
cvt.u64.u32 rd2, r6;
and.b64 rd3, rd2, 112;
mov.u64 rd4, %5;
add.s64 rd5, rd4, rd3;
ld.global.v2.f64 {fd13, fd14}, [rd5];
mul.f64 fd17, fd12, fd14;
fma.rn.f64 fd18, fd13, fd11, fd17;
mul.f64 fd19, fd11, fd14;
mul.f64 fd20, fd13, fd12;
sub.f64 fd21, fd20, fd19;
and.b32 r7, r6, -128;
add.s32 r8, r4, r7;
barrier.sync 0;
and.b32 r9, r6, 112;
add.s32 r10, r8, r9;
st.shared.v2.f64 [r10], {fd9, fd18};
barrier.sync 0;
shl.b32 r11, r5, 3;
and.b32 r12, r11, 56;
sub.s32 r13, r10, r12;
ld.shared.f64 fd22, [r13];
ld.shared.f64 fd23, [r13+64];
barrier.sync 0;
st.shared.v2.f64 [r10], {fd10, fd21};
barrier.sync 0;
ld.shared.f64 fd24, [r13];
ld.shared.f64 fd25, [r13+64];
add.f64 fd26, fd22, fd23;
add.f64 fd27, fd24, fd25;
sub.f64 fd28, fd22, fd23;
sub.f64 fd29, fd24, fd25;
bfe.u32 r14, r5, 1, 2;
mul.wide.u32 rd6, r14, 16;
mov.u64 rd7, %6;
add.s64 rd8, rd7, rd6;
ld.global.v2.f64 {fd30, fd31}, [rd8];
mul.f64 fd34, fd29, fd31;
fma.rn.f64 fd35, fd30, fd28, fd34;
mul.f64 fd36, fd28, fd31;
mul.f64 fd37, fd30, fd29;
sub.f64 fd38, fd37, fd36;
and.b32 r15, r11, 8;
add.s32 r16, r8, r15;
barrier.sync 0;
and.b32 r17, r6, 96;
add.s32 r18, r16, r17;
st.shared.f64 [r18], fd26;
st.shared.f64 [r18+16], fd35;
barrier.sync 0;
and.b32 r19, r11, 48;
sub.s32 r20, r18, r19;
ld.shared.f64 fd39, [r20];
ld.shared.f64 fd40, [r20+64];
barrier.sync 0;
st.shared.f64 [r18], fd27;
st.shared.f64 [r18+16], fd38;
barrier.sync 0;
ld.shared.f64 fd41, [r20];
ld.shared.f64 fd42, [r20+64];
add.f64 fd43, fd39, fd40;
add.f64 fd44, fd41, fd42;
sub.f64 fd45, fd39, fd40;
sub.f64 fd46, fd41, fd42;
bfe.u32 r21, r5, 2, 1;
mul.wide.u32 rd9, r21, 16;
mov.u64 rd10, %7;
add.s64 rd11, rd10, rd9;
ld.global.v2.f64 {fd47, fd48}, [rd11];
mul.f64 fd51, fd46, fd48;
fma.rn.f64 fd52, fd47, fd45, fd51;
mul.f64 fd53, fd45, fd48;
mul.f64 fd54, fd47, fd46;
sub.f64 fd55, fd54, fd53;
and.b32 r22, r11, 24;
add.s32 r23, r8, r22;
barrier.sync 0;
and.b32 r24, r6, 64;
add.s32 r25, r23, r24;
st.shared.f64 [r25], fd43;
st.shared.f64 [r25+32], fd52;
barrier.sync 0;
and.b32 r26, r11, 32;
sub.s32 r27, r25, r26;
ld.shared.f64 fd56, [r27];
ld.shared.f64 fd57, [r27+64];
barrier.sync 0;
st.shared.f64 [r25], fd44;
st.shared.f64 [r25+32], fd55;
barrier.sync 0;
ld.shared.f64 fd58, [r27];
ld.shared.f64 fd59, [r27+64];
add.f64 %0, fd56, fd57;
add.f64 %1, fd58, fd59;
sub.f64 %2, fd56, fd57;
sub.f64 %3, fd58, fd59;
})"
     : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y));
};


#endif
